* This do-file reads in raw QCEW data and saves a clean version of employment for sample industries at the month by CZ level.
* Author: Monica Essig Aberg 

* ------------------------------------------------------------------------------
* Get QCEW data
* Monica Essig Aberg
* 1 February 2024
* Updated: 19 March 2024
* ------------------------------------------------------------------------------
* Outline
	* a. Download from QCEW 
	* b. Create QCEW CZ-quarter dataset 
	* c. Create QCEW CZ-month dataset

* ------------------------------------------------------------------------------

* a. Download from QCEW --------------------------------------------------------
/*
	* Download for nine 3 digit industries of interest

		* For last five years, quick download by industry
		foreach year of numlist 2019(1)2023 {
		foreach ind in 561 452 722 445 448 492 454 444 451 {
			noisily di "`year' `ind'"
			cap loadqcew naics, years(`year') agglvl(75) frequency(qtrly) industry(`ind') saving("$data/qcew/raw/qcew_`year'_`ind'")
			noisily di _rc
			noisily di "done"
		}
		}
		
		
	* For earlier years, download whole year at once
		foreach year of numlist 2013(1)2018 {
			noisily di "`year'"
			cap loadqcew naics, years(`year') agglvl(75) frequency(qtrly) saving("$data/qcew/raw/qcew_`year'")
			noisily di _rc
			noisily di "done"
		}
		
	* Download employment by 2 digit NAICS codes

		foreach year of numlist 2013(1)2022 {
			noisily di "`year'"
			cap loadqcew naics, years(`year') agglvl(74) frequency(qtrly) saving("$data/qcew/raw/qcew_`year'_2d")
			noisily di _rc
			noisily di "done"
		}
		
	* Download total employment

		foreach year of numlist 2013(1)2022 {
			noisily di "`year'"
			cap loadqcew naics, years(`year') agglvl(71) frequency(qtrly) saving("$data/qcew/raw/qcew_`year'_agg")
			noisily di _rc
			noisily di "done"
		}
		
	* Do all separately for 2023: in loadqcew, find and replace q1-q4 with q1-q2 (or q3 when available), run loadqcew, and revert loadqcew after running to avoid overriding
	
		foreach year of numlist 2023 {
			noisily di "`year' `ind'"
			loadqcew naics, years(`year') agglvl(75) frequency(qtrly) saving("$data/qcew/raw/qcew_`year'")
			noisily di _rc
			noisily di "done"
		}
		foreach year of numlist 2023 {
			noisily di "`year' `ind'"
			loadqcew naics, years(`year') agglvl(74) frequency(qtrly) saving("$data/qcew/raw/qcew_`year'_2d")
			noisily di _rc
			noisily di "done"
		}
		foreach year of numlist 2023 {
			noisily di "`year' `ind'"
			loadqcew naics, years(`year') agglvl(71) frequency(qtrly) saving("$data/qcew/raw/qcew_`year'_agg")
			noisily di _rc
			noisily di "done"
		}
*/
* b. Create QCEW CZ-quarter dataset --------------------------------------------

	// Quarterly average weekly wage values are calculated by dividing quarterly 
	// total wages by the average of the three monthly employment levels and 
	// dividing the result by thirteen.

	* Load raw data
	clear
	foreach year of numlist 2019(1)2022 {
	foreach ind in 561 452 722 445 448 492 454 444 451 {
		cap append using "$data/qcew/raw/qcew_`year'_`ind'", force
	}
	}
	foreach year of numlist 2013(1)2018 2023 {
		append using "$data/qcew/raw/qcew_`year'", force
	}
	foreach year of numlist 2013(1)2023 {
		append using "$data/qcew/raw/qcew_`year'_agg", force
	}
	
	* Keep industries of interest and total
	destring industry_code, replace
	keep if inlist(industry_code, 561, 452, 722, 445, 448, 492, 454, 444, 451, 10)
	
	* Get average employment
	gen avg_emp = (month1_emplvl+month2_emplvl+month3_emplvl)/3
	
	* Keep necessary vars
	keep industry_code area_fips area_title qtr year total_qtrly_wages avg_wkly_wage avg_emp

	* Collapse to county industry quarter level
	collapse (rawsum) total_qtrly_wages avg_emp (mean) avg_wkly_wage [pw = avg_emp], by(industry_code area_title area_fips qtr year)

	* Format date
	gen qdate = yq(year, qtr)
	format qdate %tq

	* Crosswalk to CZ
	destring area_fips, gen(cty_fips)
	merge m:1 cty_fips using "$data/crosswalks/cw_cty_czone"
	keep if _merge == 3
	drop _merge

	* Collapse to CZ industry quarter level
	gen total = industry_code == 10
	collapse (rawsum) total_qtrly_wages avg_emp (mean) avg_wkly_wage [pw = avg_emp], by(qtr year qdate czone total)
	
	* Reshape total employment
	reshape wide total_qtrly_wages avg_wkly_wage avg_emp, i(qtr year qdate czone) j(total)
	rename *0 *_all
	rename *1 *_main_ind

	* Save
	save "$data/qcew/qcew_wage_cz.dta", replace
	
* c. Create QCEW CZ-month dataset ----------------------------------------------
	
	* Retail first
	clear
	foreach year of numlist 2013/2023 {
		append using "$data/qcew/raw/qcew_`year'_2d"
	}

	keep if own_title == "Private"

	keep if industry_code == "44-45"

	gen missing_qcew_retail = month1 == 0 & month2 == 0 & month3 == 0

	rename month*_emplvl emp*
	reshape long emp, i(industry_code area_title area_fips qtr year missing) j(month)
	replace month = month+(qtr-1)*(3)

	* Format date
	gen mdate = ym(year, month)
	format mdate %tm

	* Crosswalk to CZ
	destring area_fips, gen(cty_fips)
	merge m:1 cty_fips using "$data/crosswalks/cw_cty_czone"
	keep if _merge == 3
	drop _merge

	collapse (sum) emp (max) missing, by(qtr year month mdate czone)
	
	tab missing
	// missing data in 8% of CZs
	
	rename emp emp_retail
	
	tempfile retail
	save `retail', replace
	
	* Load raw data
	clear
	foreach year of numlist 2019(1)2023 {
	foreach ind in 561 452 722 445 448 492 454 444 451 {
		cap append using "$data/qcew/raw/qcew_`year'_`ind'", force
	}
	}
	foreach year of numlist 2013(1)2018 2023 {
		append using "$data/qcew/raw/qcew_`year'", force
	}
	foreach year of numlist 2013(1)2023 {
		append using "$data/qcew/raw/qcew_`year'_agg", force
	}
		
	* Keep industries of interest and total
	destring industry_code, replace
	keep if inlist(industry_code, 561, 452, 722, 445, 448, 492, 454, 444, 451, 10)
	keep if own_title == "Private"
	
	* Keep necessary vars
	keep industry_code area_fips area_title qtr year month*
	gen missing_qcew_data = month1 == 0 & month2 == 0 & month3 == 0

	* Sum employment
	collapse (sum) month* (max) missing_qcew_data, by(industry_code area_title area_fips qtr year)
	
	* Reshape long
	rename month*_emplvl emp*
	reshape long emp, i(industry_code area_title area_fips qtr year missing) j(month)
	replace month = month+(qtr-1)*(3)

	* Format date
	gen mdate = ym(year, month)
	format mdate %tm

	* Crosswalk to CZ
	destring area_fips, gen(cty_fips)
	merge m:1 cty_fips using "$data/crosswalks/cw_cty_czone"
	keep if _merge == 3
	drop _merge

	* Collapse
	gen total = industry_code == 10
	collapse (sum) emp (max) missing, by(qtr year month mdate czone total)
	
	* Reshape total employment
	reshape wide emp missing, i(qtr year month mdate czone) j(total)
	rename emp1 emp_all
	rename missing*1 missing_qcew_all
	rename emp0 emp_main_ind
	rename missing*0 missing_qcew_main_ind
	
	* Merge with retail
	merge 1:1 mdate czone using `retail', nogen

	* Save
	save "$data/qcew/qcew_emp_cz.dta", replace

	
	* QCEW analysis dataset
		
	import excel using "$data/equifax/events_qcew.xlsx", first clear
	
	foreach var in qdate qdate_pre qdate_post{
	rename `var' `var'_string
	gen `var' = quarterly(`var'_string, "YQ")
	format `var' %tq
	drop `var'_string
	}
	
	save "$data/equifax/events_qcew.dta", replace
	
	clear all

* a. Download from QCEW --------------------------------------------------------

* increase default internet timeout wait times to avoid getting error r(2)
set timeout1 180
set timeout2 180
		
* Aggregation levels codes: 74 (2 digit), 75 (3 digit), 76 (4 digit), 77 (5 digit), 78 (6 digit)

foreach year of numlist 2013(1)2023 {
    foreach agglevel of numlist 74(1)78 {
		
        if `agglevel' == 74 {
            local naics_digits "2d"
        } 
		else if `agglevel' == 75 {
            local naics_digits "3d"
        } 
		else if `agglevel' == 76 {
            local naics_digits "4d"
        } 
		else if `agglevel' == 77 {
            local naics_digits "5d"
        } 
		else if `agglevel' == 78 {
            local naics_digits "6d"
        }

        noisily di "`year' `agglevel'"
        cap loadqcew naics, years(`year') agglvl(`agglevel') frequency(qtrly) saving("$data/qcew/raw/qcew_`year'_`naics_digits'")
        noisily di _rc
        noisily di "done"
    }
}


* Download total employment by county

foreach year of numlist 2013(1)2023 {
	noisily di "`year'"
	cap loadqcew naics, years(`year') agglvl(71) frequency(qtrly) saving("$data/qcew/raw/qcew_`year'_agg")
	noisily di _rc
	noisily di "done"
		}

* b. Create QCEW CZ-quarter dataset --------------------------------------------

clear

	 // Load and clean raw data one by one as appending everything before would 
	 // not be computationally feasible
	 
foreach year of numlist 2013(1)2023 {
    foreach ind_digits of numlist 1(1)6 {
		
		if `ind_digits' == 1 {
            use "$data/qcew/raw/qcew_`year'_agg", clear // aggregate by all industries
        } 
		else {
            use "$data/qcew/raw/qcew_`year'_`ind_digits'd", clear
        }  

		* Get average employment
		gen avg_emp = (month1_emplvl+month2_emplvl+month3_emplvl)/3
		
		* Keep necessary vars
		keep industry_code area_fips area_title qtr year total_qtrly_wages avg_wkly_wage avg_emp

		* Collapse to county industry quarter level (we are summing over ownership title)
		collapse (rawsum) total_qtrly_wages avg_emp (mean) avg_wkly_wage [pw = avg_emp], by(industry_code area_title area_fips qtr year)

		* Format date
		gen qdate = yq(year, qtr)
		format qdate %tq

		* Crosswalk to CZ
		* notice that some counties do not merge because they are in US territories or are undefined
		qui gen state_fips = substr(area_fips,1,2)
		qui destring area_fips, gen(cty_fips)
		qui merge m:1 cty_fips using "$data/crosswalks/cw_cty_czone"
		
		* tabulate those areas that couldn't be merged
		gen undefined_county = 0
		replace undefined_county = 1 if substr(area_fips,3,5) == "999"
		if `ind_digits' == 1 {
			di "Observations in year `year' with _merge == 1"
			tab state_fips undefined_county if _merge == 1
	        * tabulate counties that are not undefined nor in Puerto Rico or US territories
			tab area_title if _merge == 1 & undefined_county == 0 & state_fips != "72" & state_fips != "78" 
			di "Observations in year `year' with _merge == 2"
			*tostring cty_fips, gen(cty_fips_str)
			*replace cty_fips_str = "0" + cty_fips_str if strlen(cty_fips_str) == 4
			*gen state_fips_using_data = substr(cty_fips_str)
			tab cty_fips if _merge == 2
		}
		
		
		* only keep counties that were merged successfully
		keep if _merge == 3
		drop _merge
			
		qui save "$data/qcew/intermediate/qcew_cz_`year'_`ind_digits'd", replace
	
    }
}

// Append files and then merge counties with commuting zones

	use "$data/qcew/intermediate/qcew_cz_2013_1d", clear

	foreach year of numlist 2013(1)2023 {
		foreach ind_digits of numlist 1(1)6 {
		
			if !(`year' == 2013 & `ind_digits' == 1) {
				append using "$data/qcew/intermediate/qcew_cz_`year'_`ind_digits'd"
			}
		}
	}

	* Collapse to CZ industry quarter level
	gen cz_total = industry_code == "10" // 10 is "Total, all industries"
	collapse (rawsum) total_qtrly_wages avg_emp (mean) avg_wkly_wage [pw = avg_emp], by(qtr year qdate czone industry_code cz_total)
	
	* Create variable with number of naics digits
	replace industry_code = "31" if industry_code == "31-33"
	gen naics_digits = strlen(industry_code)

	* Count number of commuting zones per industry
	egen num_czones = count(czone), by(naics_digits industry_code year qdate)
	
	* Create separate variables for total county 
	preserve
	tempfile total_cz_vars
	keep if industry_code == "10"
	rename total_qtrly_wages cz_total_qtrly_wages
	rename avg_emp cz_avg_emp 
	rename avg_wkly_wage cz_avg_wkly_wage
	rename num_czones usa_czones
	keep qdate cz* usa_czones
	save `total_cz_vars', replace
	restore 
	
	drop if industry_code == "10"
	merge m:1 qdate czone using `total_cz_vars'
	drop _merge
	
	label var num_czones "Number of commuting zones with data for quarter-year-industry"
	label var naics_digits "Level of aggregation of industry NAICS classification"
	
	label var total_qtrly_wages "Total wages by quarter-industry-czone"
	label var avg_emp "Average number of employees by quarter-industry-czone"
	label var avg_wkly_wage "Average weekly wage by quarter-industry-czone"
	
	label var cz_total_qtrly_wages "Total wages by quarter-czone"
	label var cz_avg_emp "Average number of employees by quarter-czone"
	label var cz_avg_wkly_wage "Average weekly wage by quarter-czone"
	
	label var usa_czones "Number of commuting zones successfully merged that year"
	
	save "$data/qcew/qcew_cz_all_industries.dta", replace
